home *** CD-ROM | disk | FTP | other *** search
-
- /* Copyright (c) CNIDR (Work in progress) */
-
- /* WIDE AREA INFORMATION SERVER SOFTWARE
- No guarantees or restrictions. See the readme file for the full standard
- disclaimer.
- Brewster@think.com
- */
-
-
- /* implements the search part of irext.h
- (search_word and finished_search_word)
- -brewster
-
- Split from irsearch.c
-
- 5/31/91 Added scale_scores. Fixed document_score_array to long.
- 7/8/91 Removed scale_scores, handled in search_word with doc_id > 0.
- 2/4/92 Made document_score_array a double.
-
- - Jonny G
- * $Log: sersrch.c,v $
- * Revision 1.54 1994/12/13 17:03:58 pfeifer
- * *** empty log message ***
- *
- * Revision 1.53 1994/11/14 15:58:17 pfeifer
- * Patch by Archie Warnoc in c.i.w (must be made size dependent?)
- *
- * Revision 1.52 1994/09/06 16:53:48 pfeifer
- * Syn cache patch
- *
- * Revision 1.51 1994/08/05 09:46:46 pfeifer
- * No more 'MAXINT redefined' complaints.
- *
- * Revision 1.50 1994/08/05 07:12:38 pfeifer
- * Release beta 04
- *
- * Revision 1.49 1994/07/13 07:52:36 huynh1
- * Uli
- *
- * Revision 1.48 1994/05/27 09:13:21 huynh1
- * boolean code updated. beta
- *
- * Revision 1.47 1994/05/26 14:33:57 huynh1
- * search_word updated (read_weight_from_stream).
- * beta.
- *
- * Revision 1.46 1994/05/20 12:49:58 pfeifer
- * beta
- *
- * Revision 1.45 1994/05/19 12:44:39 huynh1
- * search_word updated.
- *
- * Revision 1.44 1994/05/18 17:28:13 huynh1
- * new term weighting
- * higher retrieval quality.
- *
- * Revision 1.40 1994/04/28 16:28:01 huynh1
- * stemming
- *
- * Revision 1.39 1994/04/06 23:52:04 huynh1
- * 08, autoconf, Uli
- *
- * Revision 1.38 1994/03/23 13:11:07 pfeifer
- * removed include iso.h
- *
- * Revision 1.37 1994/03/08 20:46:12 huynh1
- * Patchlevel 04
- *
- * Revision 1.36 1994/02/14 10:33:04 huynh1
- * new code for field concept added.
- *
- * Revision 1.36 1993/12/08 17:38:00 huynh1
- * bug by mixing literal and nested boolean corrected!
- *
- * Revision 1.10 1993/10/13 14:14:20 huynh1
- * new code added for encapsulated boolean queries and
- * modified literal search
- *
- * Revision 1.3 1993/07/13 08:19:56 pfeifer
- * Sicherung vor Aenderungen Tung
- *
- * Revision 1.1 1993/02/16 15:05:35 freewais
- * Initial revision
- *
- * Revision 1.24 92/04/28 16:56:54 morris
- * added boolean to serial engine
- *
- * Revision 1.23 92/03/15 10:15:18 jonathan
- * Added Simon Spero's ASSIGN replacement for read_bytes.
- *
- * Revision 1.22 92/03/05 07:09:54 shen
- * add two more dummy arguments to call to init_search_engine
- *
- * Revision 1.21 92/02/12 17:29:52 jonathan
- * Conditionalized inclusion of object code.
- *
- * Revision 1.20 92/02/12 13:40:06 jonathan
- * Added "$Log" so RCS will put the log message in the header
- *
- */
-
- #include "cutil.h"
- #include "irfiles.h"
- #ifdef BIO
- #include "irtfiles.h" /* dgg, for wordDelimiter */
- #endif
- #include "irsearch.h"
- #include "irext.h"
- #include "byte_order.h"
- /* #include <string.h> */
- #include <ctype.h>
-
- #include <math.h>
-
/* Drop any MAXINT inherited from a system header so that the definition
   below does not produce a "MAXINT redefined" complaint. */
#ifdef MAXINT
#undef MAXINT
#endif
/* 2 raised to the power (number of bits in a long - 1), as an unsigned long.
   The former spelling used `^', which is bitwise XOR in C (not
   exponentiation), and was not parenthesized, so it neither produced this
   value nor expanded safely inside larger expressions.
   NOTE(review): if "largest positive long" was intended instead, subtract 1. */
#define MAXINT ((unsigned long)1 << (sizeof(long)*8-1))
/* factor by which best_hit() multiplies a hit weight to obtain its score */
#define VALUE 1000000L
- /* francois */
- #include "stemmer.h"
-
- /* tung, 10/93 */
- #ifdef NESTED_BOOLEANS
- #include "boolean_op.h"
- #endif
- /* tung, 10/93 */
-
- #ifdef FIELDS /* tung, 1/94 */
- #include "field_search.h"
- #endif
-
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- #include "weight.h"
- #endif
-
- #ifdef BOOL
- #include "obj.h"
- #include "irparse.h"
- object* currentQuery = NULL; /* kludge until irext goes away */
- #endif /* def BOOL */
-
- /* weighting for relevant document terms -
- this may become a parameter to the query.
- */
-
- #define RF_WEIGHTING 0.1
-
- /* ==================================
- * === Initialization Functions ===
- * ==================================*/
-
-
- long init_search_engine(file, initialize, for_search, cm_mem_percent,
- text_size, grow_percent)
- char* file;
- boolean initialize;
- boolean for_search;
- long cm_mem_percent; /* unused */
- long text_size; /* unused */
- long grow_percent; /* unused */
- {
- static boolean inited = false;
-
- if (inited == false)
- {
- #ifdef BOOL
- initObj();
- initBool();
- #endif
- inited = true;
- }
-
- return(0);
- }
-
/* Shut-down hook of the search engine.
 *
 * With CACHE_SYN: if cacheSynId is set, attach the shared segment it names,
 * walk its t_cacheSyn entries (at most MAX_SYN_CACHE, stopping at the first
 * entry whose id is 0) and mark each listed segment, and finally the
 * segment cacheSynId itself, for removal with IPC_RMID.  Failures are only
 * logged.
 *
 * Always returns 0.
 */
long finished_search_engine()
{
#ifdef CACHE_SYN
  /* clean up shared memory segments */
  if (cacheSynId != 0) {
    t_cacheSyn *table = (t_cacheSyn *) shmat (cacheSynId, 0, 0);

    if (table != ((t_cacheSyn *)-1)) {
      int slot;

      for (slot = 0; slot < MAX_SYN_CACHE; slot++) {
        if (table[slot].id == 0)
          break;                /* first unused entry ends the list */
        if (shmctl(table[slot].id, IPC_RMID, (t_cacheSyn *)0) < 0)
          waislog (WLOG_HIGH, WLOG_WARNING, "Error detatching shared memory segment (id=%d)", table[slot].id);
      }
      if (shmctl(cacheSynId, IPC_RMID, (t_cacheSyn *)0) < 0)
        waislog (WLOG_HIGH, WLOG_WARNING, "Error detatching shared memory segment (id=%d)", cacheSynId);
    }
  }
#endif
  return(0);
}
-
-
- /*
- * ext_open_database: see irext.h
- */
-
- long ext_open_database (db, initialize, for_search)
- database *db;
- boolean initialize;
- boolean for_search;
- { /* this has to deal with the .inv file */
- char file[MAX_FILE_NAME_LEN];
-
- if(initialize) /* make a new one */
- db->index_stream = s_fopen(index_filename(file, db), "w+b");
- else if(for_search) /* just search */
- db->index_stream = s_fopen(index_filename(file, db), "rb");
- else /* write to an existing db */
- db->index_stream = s_fopen(index_filename(file, db), "r+b");
-
- if (db->index_stream == NULL) {
- waislog(WLOG_HIGH, WLOG_ERROR,"2can't open the inverted index file %s\n",
- file);
- disposeDatabase(db);
- return(1);
- }
- return(0);
- }
-
-
-
- /*
- * ext_close_database: see irext.h
- */
-
- long ext_close_database (db)
- database *db;
- {
- return(0);
- }
-
/* Maps a database name to the file to use for it.  Here the mapping is the
   identity: the very pointer passed in is handed back, nothing is copied. */
char *database_file(database_name)
     char *database_name;
{
  char *file_name = database_name;

  return(file_name);
}
-
- /*===========================*
- *=== Setting Parameters ===*
- *===========================*/
-
- long max_hit_retrieved = 0;
- char **srcs = NULL;
-
- long set_query_parameter (mask, parameters)
- long mask;
- query_parameter_type * parameters;
- {
- switch (mask)
- {
- case SET_MAX_RETRIEVED_MASK:
- max_hit_retrieved = parameters->max_hit_retrieved;
- return(0);
- break;
- case SET_SELECT_SOURCE:
- if(NULL != srcs){
- if(NULL != srcs[0])
- s_free(srcs[0]);
- s_free(srcs);
- }
- srcs = parameters->srcs;
- break;
- default:
- return(-1);
- break;
- }
- return(0);
- }
-
- /*==============================*
- *=== Document Score Array ===*
- *==============================*/
-
/* number of entries the score arrays below can hold */
long document_score_array_len = 0;
/* accumulated score per document, indexed by document id */
double *document_score_array = NULL;
#ifdef NESTED_BOOLEANS /* tung, 1/94 */
/* companion array, allocated, cleared and freed together with
   document_score_array */
double *NumPart_score_array = NULL;
#else
#ifdef BOOLEANS
/* companion array, allocated, cleared and freed together with
   document_score_array */
double *prev_score_array = NULL; /* 12/91 GS TLG */
#endif
#endif
-
- #ifdef NESTED_BOOLEANS
- /* tung, 10/93 */
- search_result_struct *search_result_array = NULL;
- long operand_id = 0;
-
- static void clear_search_result_array _AP((long* number_of_elements));
- static void clear_search_result_array(number_of_elements)
- long* number_of_elements;
- {
- long count;
-
- if(*number_of_elements > 1 && search_result_array != NULL) {
- for(count=0; count < *number_of_elements; count++) {
- if(search_result_array[count].doc_ids_array != NULL)
- s_free(search_result_array[count].doc_ids_array);
- }
- s_free(search_result_array);
- }
- *number_of_elements = 1;
- }
-
- static void make_search_result_array _AP((long length));
- static void make_search_result_array(length)
- long length;
- {
- if(search_result_array == NULL) {
- search_result_array =
- (search_result_struct *)
- s_malloc((size_t)(length * sizeof(search_result_struct)));
- operand_id = 0;
- }
- }
-
- static boolean make_doc_ids_array _AP((long pos, long length));
- static boolean make_doc_ids_array(pos, length)
- long pos;
- long length;
- {
- /* if(search_result_array[pos].doc_ids_array == NULL) */
- search_result_array[pos].doc_ids_array =
- (doc_descr_struct *)
- s_malloc((size_t)(sizeof(doc_descr_struct) * length));
- if(search_result_array[pos].doc_ids_array == NULL) {
- waislog(WLOG_HIGH, WLOG_ERROR, "Out of memory");
- return(false);
- }
- return(true);
- }
- /* tung, 10/93 */
- #endif
-
- /* make_document_score_array insures that the document_score_array
- array is long enough, if not it makes it long enough */
- static void make_document_score_array _AP((long length ));
- static void make_document_score_array(length)
- long length;
- {
- if(length <= document_score_array_len)
- return;
- /* we have to make a new one. free the old one first (if any) */
- if(document_score_array != 0){
- s_free(document_score_array);
- #ifdef NESTED_BOOLEANS /* tung, 1/94 */
- s_free(NumPart_score_array);
- #else
- #ifdef BOOLEANS
- s_free(prev_score_array); /* 12/91 GS TLG */
- #endif
- #endif
- }
- document_score_array = (double*)s_malloc((size_t)(length * sizeof(double)));
- #ifdef NESTED_BOOLEANS /* tung, 1/94 */
- NumPart_score_array = (double*)s_malloc((size_t)(length * sizeof(double)));
- memset(NumPart_score_array, 0,
- document_score_array_len * sizeof(double));
- #else
- #ifdef BOOLEANS
- prev_score_array = (double*)s_malloc((size_t)(length * sizeof(double))); /* 12/91 GS TLG */
- #endif
- #endif
- document_score_array_len = length;
- }
-
- static void destroy_document_score_array _AP((void));
- static void destroy_document_score_array()
- {
- s_free(document_score_array);
- #ifdef NESTED_BOOLEANS /* tung, 1/94 */
- s_free(NumPart_score_array);
- #else
- #ifdef BOOLEANS
- s_free(prev_score_array); /* 12/91 GS TLG */
- #endif
- #endif
- document_score_array_len = 0;
- }
-
- void clear_document_score_array()
- /* side effects the document_score_array. */
- {
- memset(document_score_array, 0,
- document_score_array_len * sizeof(double));
- #ifdef NESTED_BOOLEANS /* tung, 1/94 */
- memset(NumPart_score_array, 0,
- document_score_array_len * sizeof(double));
- #else
- #ifdef BOOLEANS
- memset(prev_score_array, 0, /* 12/91 GS TLG */
- document_score_array_len * sizeof(double)); /* 12/91 GS TLG */
- #endif
- #endif
- }
-
- /* for debugging purposes */
- void print_document_score_array(start,stop)
- unsigned long start;
- unsigned long stop;
- /* assumes start >= 0, stop < db->doc_table_allocated_entries */
- {
- long i;
- for(i = start; i <= stop; i++){
- printf("entry number %d: %f \n",
- i, document_score_array[i]);
- }
- }
-
-
-
- /*=========================*
- *=== Best Hits Array ===*
- *=========================*/
-
- hit *best_hits_array = NULL;
- long best_hits_array_len = 0;
- long current_best_hit = 0;
- long doc_start = 0; /* tung, 5/94 */
- long doc_end = 0; /* tung, 5/94 */
-
- /* see irext.h for doc */
- long init_best_hit (db)
- database *db;
- {
-
- #ifdef BOOL
- if (currentQuery != NULL)
- send(currentQuery,InitBestHit,db);
- #endif /* def BOOL */
-
- return(0);
- }
-
- /* make_best_hits_array insures that the best_hits_array
- array is long enough, if not it makes it long enough */
- static void make_best_hits_array _AP((long length));
- static void make_best_hits_array(length)
- long length;
- {
- if(length <= best_hits_array_len)
- return;
- /* we have to make a new one. free the old one first (if any) */
- if(best_hits_array != 0){
- s_free(best_hits_array);
- }
- best_hits_array = (hit*)s_malloc((size_t)(length * sizeof(hit)));
- best_hits_array_len = length;
- }
-
- static void destroy_best_hits_array _AP((void));
- static void destroy_best_hits_array()
- {
- s_free(best_hits_array);
- best_hits_array_len = 0;
- }
-
- void clear_best_hits_array()
- /* side effects the best_hits_array. XXX could use memset */
- {
- memset((char*)best_hits_array, 0, best_hits_array_len * sizeof(hit));
- }
-
- /* for debugging purposes */
- void print_best_hits()
- {
- long i;
- for( i = 0; i < best_hits_array_len; i++){
- if (best_hits_array[i].weight != 0)
- { printf("Best hit %ld: weight %lf, doc_id %ld, headline %s, filename %s, lines %ld\n",
- i, best_hits_array[i].weight,
- best_hits_array[i].document_id,
- best_hits_array[i].headline,
- best_hits_array[i].filename,
- best_hits_array[i].number_of_lines);
- }
- }
- }
-
- void sort_best_hits(db)
- database * db;
- {
- /* returns nothing.
- * side effects best_hits and document_score_array
- */
-
- long i, doc;
- double worst_weight_to_make_it = 0.0;
- document_table_entry doc_entry;
- long best_hit_number = 0;
-
- /* snuff the scores */
- for(i = 0; i < max_hit_retrieved; i++){
- best_hits_array[i].weight = 0.0;
-
- }
-
- /* loop over the doc, and keep the doc_id and weight in best hit table */
- /* for(doc = 0; doc < db->doc_table_allocated_entries; doc++){ */
- for(doc = doc_start; doc <= doc_end; doc++) {
- double weight = document_score_array[doc];
- /* jmf */
- if(weight > 0) {
- #ifndef NEW_WEIGHT /* tung, 5/94 */
- read_document_table_entry(&doc_entry, doc, db); /* if this could be
- removed, we'd gain speed */
- if (doc_entry.document_length)
- weight/=doc_entry.document_length;
- else
- weight = 0;
- #endif
- if(worst_weight_to_make_it < weight){
- /* merge it into the best_hits array. start at the bottom */
- for(i = (max_hit_retrieved - 1); i >= 0; i--){
- if(weight > best_hits_array[i].weight
- /* && (check_document_id(doc, db) == true) too slow.*/
- ){
- /* move this entry down */
- if((i + 1) < max_hit_retrieved){
- best_hits_array[i+1].weight = best_hits_array[i].weight;
- best_hits_array[i+1].document_id = best_hits_array[i].document_id;
- }
- best_hits_array[i].document_id = doc;
- best_hits_array[i].weight = weight;
- }
- else
- break;
- }
- }
- }
- }
- doc_start = doc_end = 0; /* tung, 5/94 */
- for(i = 0; i < max_hit_retrieved; i++){
- if(best_hits_array[i].weight <= 0.0)
- return;
- if (read_document_table_entry(&doc_entry,
- best_hits_array[i].document_id,
- db)
- == true){
- best_hits_array[best_hit_number].weight = best_hits_array[i].weight;
-
- best_hits_array[best_hit_number].document_id = best_hits_array[i].document_id;
- best_hits_array[best_hit_number].start_character = doc_entry.start_character;
- best_hits_array[best_hit_number].end_character = doc_entry.end_character;
- best_hits_array[best_hit_number].document_length = doc_entry.document_length;
- best_hits_array[best_hit_number].number_of_lines = doc_entry.number_of_lines;
- sprintf(best_hits_array[best_hit_number].date, "%d", doc_entry.date);
- read_filename_table_entry(doc_entry.filename_id,
- best_hits_array[best_hit_number].filename,
- best_hits_array[best_hit_number].type,
- NULL,
- db),
- strncpy(best_hits_array[best_hit_number].headline,
- read_headline_table_entry(doc_entry.headline_id,db),
- MAX_HEADLINE_LEN);
- best_hit_number++;
- }
- beFriendly();
- }
- for(i = best_hit_number; i < max_hit_retrieved; i++){
- best_hits_array[best_hit_number].weight = 0.0;
- }
- /* print_best_hits(s); for debugging */
- }
-
-
- /* returns the next best hit */
- long best_hit(db, doc_id, best_character, best_line, score,start,end,date,
- length,nlines,headline,filename,type)
- database *db;
- long *doc_id;
- long *best_character;
- long *best_line;
- double *score;
- long *start,*end,*date,*length,*nlines;
- char *headline,*filename,*type;
- {
- double tmp;
-
- *best_character = 0;
- *best_line = 0;
-
- #ifdef BOOL
- if (currentQuery != NULL) /* for boolean */
- {
- send(currentQuery,GetBestHit,db,doc_id,best_character,best_line,score);
- if (*doc_id > 0)
- return(0); /* ok */
- else
- return(-1); /* no more docs */
- }
- #endif /* BOOL */
-
- if(current_best_hit > best_hits_array_len)
- return(1);
- if(best_hits_array[current_best_hit].weight == 0.0)
- return(1);
- *doc_id = best_hits_array[current_best_hit].document_id;
- tmp = ((double)(best_hits_array[current_best_hit].weight*VALUE));
- *score=tmp;
- *start=best_hits_array[current_best_hit].start_character;
- *end=best_hits_array[current_best_hit].end_character;
- *date=atol(best_hits_array[current_best_hit].date);
- *length=best_hits_array[current_best_hit].document_length;
- *nlines=best_hits_array[current_best_hit].number_of_lines;
- strcpy(headline,best_hits_array[current_best_hit].headline);
- strcpy(filename,best_hits_array[current_best_hit].filename);
- strcpy(type,best_hits_array[current_best_hit].type);
- current_best_hit++;
- return(0);
- }
-
- long finished_best_hit(db)
- database *db;
- {
-
- #ifdef BOOL
- if (currentQuery != NULL) /* for boolean */
- { send(currentQuery,Delete);
- currentQuery = NULL;
- return(0);
- }
- #endif /* BOOL */
-
- /* if we are on a small machine, we might want to
- destroy_document_score_array */
- clear_document_score_array();
- clear_best_hits_array();
- current_best_hit = 0;
- return(0);
- }
-
- /*=============================*
- *=== Searching for words ===*
- *=============================*/
-
- /* see irext.h for doc */
- long init_search_word (db)
- database* db;
- {
- char fn[256];
- strcpy( fn,db->database_file );
- strcat( fn,synonym_ext );
- syn_ReadFile( fn,&db->syn_Table,&db->syn_Table_Size );
-
- return(0);
- }
-
#ifdef NESTED_BOOLEANS
/* tung, 10/93 */
/* defined in another file; search_word() compares it against 1 to tell
   whether per-operand result records are in use */
extern long number_of_operands ;
/* tung, 10/93 */
#endif

#ifdef BOOLEANS
/* set by search_word() when it meets the BOOLEAN_AND word */
static boolean gLastAnd= false;
/* set by search_word() when it meets the BOOLEAN_NOT word or a word
   carrying BOOLEAN_NOT_FLAG; while set, matching documents get score 0 */
static boolean gLastNot= false;
#endif
-
- /* see irext.h for doc */
- long search_word(word,
- #ifdef FIELDS /* tung, 5/94 */
- field_name,
- #endif
- char_pos, line_pos, weight, doc_id,
- word_pair, db)
- char *word; /* the word to be searched for */
- #ifdef FIELDS /* tung, 5/94 */
- char *field_name;
- #endif
- long char_pos; /* the position of the start of the word */
- long line_pos; /* is this needed? not for signature system */
- long weight; /* how important the word looks syntactically,
- such as is it bold */
- long doc_id; /* current document, seed words is 0,
- then it increments into the relevant
- document */
- long word_pair;
- database *db;
- {
- /* this side effects the document_score_array,
- * and downcases the word.
- * Returns 0 if successful or word not present,
- * returns non-0 if an error.
- *
- */
-
- long not_full_flag = INDEX_BLOCK_FULL_FLAG; /* start out full so it will go on looking */
- long count, index_block_size;
- long internal_document_id, number_of_valid_entries;
- double internal_weight;
- long index_file_block_number;
- long number_of_occurances;
-
- FOUR_BYTE index_buffer_data[INDEX_ELEMENT_SIZE*(1024/4)];
- char *index_buffer;
- #ifdef undef
- char *i = index_buffer; /* What the hell should be in i ? (up) */
- #endif
- FILE *stream = NULL;
-
-
- #ifdef LITERAL
- long txt_pos, icnt, wcnt, pcnt; /* 2/92 GS TLG */
- document_table_entry doc_entry; /* 2/92 GS TLG */
- static FILE *txt_stream = NULL; /* 2/92 GS TLG */
- char cmpr_word[MAX_PHRASE_LENGTH + 1]; /* 2/92 GS TLG */
- /*char phrase[MAX_PHRASE_LENGTH + 1]; */ /* 2/92 GS TLG */
- char txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */
- char *temp_txt_filename = NULL; /* francois */
- char prev_txt_filename[MAX_FILENAME_LEN + 1]; /* 2/92 GS TLG */
- char txt_type[MAX_TYPE_LEN + 1]; /* 2/92 GS TLG */
- long phraselen= 0, txt_pos_fix= 0;
- char *document_section = NULL; /* tung , 10/93 */
- long document_section_len = 0; /* tung , 10/93 */
- long phrase_readed = 0; /* tung , 10/93 */
- long phrase_count = 0; /* tung , 10/93 */
- boolean phrase_found = false; /* tung , 10/93 */
- #endif
-
- #ifdef NESTED_BOOLEANS /* tung, 10/93 */
- long numeric_partial_valid_entries = 0;
- #endif
-
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- double query_wgt;
- #else
- double idf;
- #endif
- #ifdef FIELDS /* tung, 12/93 */
- long field_id = -1;
- boolean SearchField = false;
- #endif
-
- /* do synonym conversion */
-
- /* in theory, one can replace a word with a boolean phrase */
- char *newword;
-
- newword = lookup_Synonym( word,db->syn_Table,db->syn_Table_Size );
- waislog(WLOG_HIGH,WLOG_INFO,"Word %s Syn %s",word,newword);
- strncpy(word,newword,MAX_WORD_LENGTH);
-
- #ifdef FIELDS /* tung, 12/93 */
- if(db->number_of_fields > 0) {
- if(*field_name != '\0') {
- if(strcmp(field_name, FREE_TEXT_FIELD) == 0) { /* global database */
- field_name = "\0";
- SearchField = false;
- field_id = -1;
- } else {
- SearchField = true;
- field_id = pick_up_field_id(field_name, db);
- }
- }
- }
- #endif
-
- /* tung, 10/93 */
- #ifdef NESTED_BOOLEANS
- if(number_of_operands > 1) {
- make_search_result_array(number_of_operands);
- if((weight!=LITERAL_FLAG) && IsOperator(word)) {
- boolean_operations(word, search_result_array);
- return(0);
- }
- if(strlen(word) == 1) {
- search_result_array[operand_id].number_of_hits = 0;
- search_result_array[operand_id].operand_id = operand_id;
- if(!save_operand_id(operand_id, search_result_array, db->doc_table_allocated_entries))
- return(-1);
- ++operand_id;
- return(0);
- }
- }
- #endif
- /* tung, 10/93 */
-
- /* francois - call the stemmer */
- #ifdef FIELDS /* tung, 1/94 */
- if(weight!=LITERAL_FLAG && weight!= FIELD_FLAG && weight!= NUMERIC_FLAG) {
- #ifdef STEM_WORDS
- if(field_id > -1) {
- if(db->fields[field_id].stemming)
- stemmer(word);
- }
- else {
- if(db->stemming)
- stemmer(word);
- }
- #endif
- }
- #else
- #ifdef LITERAL
- if (weight!=LITERAL_FLAG) {
- stemmer(word);
- }
- #else
- stemmer(word);
- #endif
- #endif
-
- #ifdef LITERAL
- if (weight==LITERAL_FLAG) {
- /* goto after_booleans */
- /* printf("search_word: literal word is [%s]\n", word); */
- }
- else
- #endif
-
- #ifndef NESTED_BOOLEANS /* 10,93 */
- #ifdef BOOLEANS
- if (strcmp(word,BOOLEAN_AND)==0) { /* should be all lowercase cmp here */
- gLastAnd= true;
- return(0);
- }
- else if (strcmp(word,BOOLEAN_NOT)==0) {
- /* ^^ this is bad if we intersperse "not"s in a query --
- docs found after not word may include notted word --
- need to go back to doing not words after others --
- but need now to check for literal string first
- */
- gLastNot= true;
- return(0);
- }
- if (weight == BOOLEAN_NOT_FLAG) gLastNot= true;
- #else
- ; /* if not LITERAL_FLAG */
- #endif
- #endif /* #ifndef NESTED_BOOLEANS */
-
- index_buffer = (char*)index_buffer_data;
-
- #ifdef LITERAL
- if (weight==LITERAL_FLAG) {
- /* note: we found the first word of phrase once in map_over_words, but i'm too lazy
- to put another parameter in that cascade of function calls it takes
- to get here.
- */
- char word1[MAX_WORD_LENGTH + 1];
- register int i, len;
- register boolean more;
- phraselen= MINIMUM( MAX_PHRASE_LENGTH, strlen(word));
- len = MINIMUM( MAX_WORD_LENGTH, phraselen);
- for (i=0, more=true; i < len && more; ) {
- word1[i] = word[i++];
- #ifdef BIO
- more= (wordDelimiter(word[i]) == NOT_DELIMITER);
- #else
- more= (isalnum(word[i]));
- #endif
- }
- word1[i]= '\0';
- txt_pos_fix= strlen(word1) + 1;
- /* printf("search_word: literal word1 is [%s]\n", word1); */
- #ifdef FIELDS /* tung, 1/94 */
- if((db->number_of_fields == 0) && !SearchField)
- index_file_block_number =
- look_up_word_in_dictionary(word1, &number_of_occurances, db);
- else
- index_file_block_number =
- field_look_up_word_in_dictionary(field_name, word1, &number_of_occurances, db);
- #else
- index_file_block_number =
- look_up_word_in_dictionary(word1, &number_of_occurances, db);
- #endif
- }
- else
- #endif /* LITERAL */
-
- #ifdef PARTIALWORD
- #ifdef FIELDS /* tung, 1/94 */
- index_file_block_number =
- look_up_partialword_in_dictionary(field_name,
- word, &number_of_occurances, db);
- #else
- index_file_block_number =
- look_up_partialword_in_dictionary(word, &number_of_occurances, db);
- #endif
- #else
- index_file_block_number =
- look_up_word_in_dictionary(word, &number_of_occurances, db);
- #endif
-
- current_best_hit = 0; /* so that the best hits willstart from 0 */
-
- /* check the document_score_array */
- if(document_score_array_len < db->doc_table_allocated_entries)
- make_document_score_array(db->doc_table_allocated_entries);
-
- if(index_file_block_number >= 0){
- #ifdef PARTIALWORD
- while(index_file_block_number > 0){ /* dgg, need 2nd loop here for multiple partwords */
- #endif
-
- #ifdef FIELDS /* tung, 1/94 */
- if(SearchField && *field_name != '\0')
- stream = db->field_index_streams[pick_up_field_id(field_name, db)];
- else stream = db->index_stream;
- #else
- stream = db->index_stream;
- #endif
-
- while((not_full_flag != INDEX_BLOCK_NOT_FULL_FLAG) &&
- (index_file_block_number != 0)){
- /* read the index block */
- if (0 != fseek(stream, (long)index_file_block_number,
- SEEK_SET))
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "fseek failed into the inverted file to position %ld",
- (long)index_file_block_number);
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif
- return(-1);
- }
- /*
- read(fileno(stream),index_buffer,INDEX_BLOCK_HEADER_SIZE);
-
- ASSIGN(not_full_flag,
- INDEX_BLOCK_FLAG_SIZE,
- index_buffer,
- INDEX_BLOCK_HEADER_SIZE,
- 0 );
- ASSIGN(index_file_block_number,NEXT_INDEX_BLOCK_SIZE,
- index_buffer+INDEX_BLOCK_FLAG_SIZE,
- INDEX_BLOCK_HEADER_SIZE,
- INDEX_BLOCK_FLAG_SIZE);
- ASSIGN(index_block_size,INDEX_BLOCK_SIZE_SIZE,
- index_buffer+INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE,
- INDEX_BLOCK_HEADER_SIZE,
- INDEX_BLOCK_FLAG_SIZE+NEXT_INDEX_BLOCK_SIZE);
-
- this is equivalent, but slower:
- */
- not_full_flag = read_bytes(INDEX_BLOCK_FLAG_SIZE, stream);
- index_file_block_number = read_bytes(NEXT_INDEX_BLOCK_SIZE, stream);
- index_block_size = read_bytes(INDEX_BLOCK_SIZE_SIZE, stream);
-
- /* Jim's debug code commented out
- printf("flag = %d, block_num = %d, block_size = %d\n",
- not_full_flag,
- index_file_block_number,
- index_block_size);
- */
- fflush(stdout);
-
- if(EOF == index_block_size)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "reading from the index file failed");
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif
- return(-1);
- }
-
- if(not_full_flag == INDEX_BLOCK_NOT_FULL_FLAG){
- /* not full */
- number_of_valid_entries = index_file_block_number;
- }
- else if(not_full_flag == INDEX_BLOCK_FULL_FLAG){
- /* full */
- number_of_valid_entries = index_block_size - INDEX_BLOCK_HEADER_SIZE;
- }
- else{ /* bad news, file is corrupted. */
- waislog(WLOG_HIGH, WLOG_ERROR,
- "Expected the flag in the inverted file to be valid. it is %ld",
- not_full_flag);
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif
- return(-1);
- }
- /* printf("number of valid bytes: %ld\n", number_of_valid_entries); */
-
- /* add the array to the document_score_array */
- number_of_valid_entries /= INDEX_ELEMENT_SIZE;
-
- /* tung, 10/93 */
- #ifdef NESTED_BOOLEANS
- if((number_of_operands > 1) && (search_result_array != NULL)) {
- #ifdef FIELDS /* tung, 1/94 */
- if(weight != NUMERIC_FLAG && weight != PARTIAL_FLAG) {
- #else
- if(weight != PARTIAL_FLAG) {
- #endif
- if(!make_doc_ids_array(operand_id, db->doc_table_allocated_entries))
- return(-1);
- search_result_array[operand_id].number_of_hits = number_of_valid_entries;
- }
- }
- #endif
- /* tung, 10/93 */
-
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- query_wgt = 1;
- #else
- /* ses - idf is a fist approximation to the inverse document freq. */
- /* what it actually is is the inverse occurance frequency which says
- * that the significance of a word is inversly proportional to the number
- * of times it occurs in the database */
-
- idf=1.0/number_of_occurances;
- #endif
- for(count=0;count < number_of_valid_entries;count++) {
- int wgt;
- int did;
- /*
- if(count%1024 == 0) {
- read(fileno(stream),index_buffer,INDEX_ELEMENT_SIZE*
- MINIMUM(1024,number_of_valid_entries-count));
- i=index_buffer;
- }
- */
- did = read_bytes(DOCUMENT_ID_SIZE, stream);
- (void)read_bytes(WORD_POSITION_SIZE, stream);
- txt_pos=read_bytes(CHARACTER_POSITION_SIZE, stream);
- wgt = read_bytes(WEIGHT_SIZE,stream);
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- internal_weight = read_weight_from_stream(NEW_WEIGHT_SIZE, stream);
- #endif
- /*
-
- ASSIGN(wgt,WEIGHT_SIZE,
- i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE,
- INDEX_ELEMENT_SIZE,
- DOCUMENT_ID_SIZE+WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE);
- ASSIGN(did,DOCUMENT_ID_SIZE,i,INDEX_ELEMENT_SIZE,0);
- */
- #ifdef LITERAL
- /* dgg -- is this proper update of read form to ASSIGN form ??*/
- /* txt_pos = read_bytes(CHARACTER_POSITION_SIZE, stream);*/ /* 2/92 GS TLG */
- if ((weight == LITERAL_FLAG) && (0 == doc_id)) {
- /*
- ASSIGN(txt_pos,CHARACTER_POSITION_SIZE,i+DOCUMENT_ID_SIZE+WORD_POSITION_SIZE,
- INDEX_ELEMENT_SIZE,DOCUMENT_ID_SIZE+WORD_POSITION_SIZE);
- */
- /* printf("search_word: txtpos=%d, wgt=%d, did=%d\n", txt_pos, wgt, did); */
- }
- #endif
-
- /* Commented out as suggested by Stan Isaacs at hp.com to come up with correct
- * weights when there are multiple documents in a file
- *
- * if(wgt>5L)
- * wgt-=5L;
- */
- #ifndef NEW_WEIGHT /* tung, 5/94 */
- internal_weight = log((double)wgt);
- internal_weight+=10.0;
- #endif
- internal_document_id = did;
- if((doc_start == 0) && (doc_end == 0)) /* tung, 5/94 */
- doc_start = doc_end = did; /* tung, 5/94 */
- doc_start = MINIMUM(doc_start, did); /* tung, 5/94 */
- doc_end = MAXIMUM(doc_end, did); /* tung, 5/94 */
-
- /*
- printf("entry %ld, Doc_id: %ld, weight %lf \n",
- count, internal_document_id, internal_weight);
- fflush(stdout);
- */
- if(EOF == wgt)
- {
- waislog(WLOG_HIGH, WLOG_ERROR,
- "reading from the doc-id table failed");
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif
- return(-1);
- }
-
- #ifdef LITERAL
- if ((weight == LITERAL_FLAG) && (0 == doc_id)) { /* 2/92 GS TLG */
- if (true == read_document_table_entry(&doc_entry, /* 2/92 GS TLG */
- internal_document_id, db)) /* 2/92 GS TLG */
- { /* 2/92 GS TLG */
- read_filename_table_entry(doc_entry.filename_id, /* 2/92 GS TLG */
- txt_filename, txt_type, NULL, db); /* 2/92 GS TLG */
- /* printf("search_word: document is [%s]\n", txt_filename); */
- if (NULL == txt_stream) {
- /* francois */
- if (probe_file(txt_filename)) {
- txt_stream = s_fopen(txt_filename, "rb");
- }
- else if (probe_file_possibly_compressed(txt_filename)) {
- temp_txt_filename = s_fzcat(txt_filename);
- if (temp_txt_filename) {
- txt_stream = s_fopen(temp_txt_filename, "rb");
- }
- }
-
- strcpy(prev_txt_filename, txt_filename);
- }
- else if (0 != strcmp(txt_filename, prev_txt_filename)) {
- s_fclose(txt_stream);
- /* francois */
- if ( temp_txt_filename != NULL ) {
- unlink(temp_txt_filename);
- s_free(temp_txt_filename);
- }
- if (probe_file(txt_filename)) {
- txt_stream = s_fopen(txt_filename, "rb");
- }
- else if (probe_file_possibly_compressed(txt_filename)) {
- temp_txt_filename = s_fzcat(txt_filename);
- if (temp_txt_filename) {
- txt_stream = s_fopen(temp_txt_filename, "rb");
- }
- }
- strcpy(prev_txt_filename, txt_filename); /* 2/92 GS TLG */
- }
-
- txt_pos += doc_entry.start_character - txt_pos_fix; /* dgg */
- document_section_len = doc_entry.end_character - txt_pos; /* tung, 10/93 */
- s_fseek(txt_stream, txt_pos, SEEK_SET); /* 2/92 GS TLG */
- document_section =
- (char*) s_malloc((size_t)((document_section_len+1)*sizeof(char))); /* tung, 10/93 */
- fgets(document_section, document_section_len, txt_stream); /* tung, 10/93 */
- phrase_readed = 0; /* tung, 10/93 */
- phrase_readed += strlen(document_section); /* tung, 10/93 */
- document_section = string_downcase(document_section); /* tung, 10/93 */
- #if 0
- fread(phrase, 1L, phraselen, txt_stream); /* 2/92 GS TLG */
- /* { phrase[phraselen]= '\0';
- printf("search_word: file phrase is [%s]\n", phrase);
- } */
- if (0 != strncasecmp(word, phrase, phraselen)) /* 2/92 GS TLG */
- internal_weight = 0.0; /* 2/92 GS TLG */
- #endif
- if (NULL == strstr(document_section, word)) { /* tung, 10/93 */
- while(phrase_readed < document_section_len) { /* tung, 10/93 */
- fgets(document_section, document_section_len, txt_stream); /* tung, 10/93 */
- phrase_readed += strlen(document_section); /* tung, 10/93 */
- document_section = string_downcase(document_section); /* tung, 10/93 */
- if(strstr(document_section, word) != NULL) { /* tung, 10/93 */
- phrase_found = true; /* tung, 10/93 */
- break; /* tung, 10/93 */
- } /* tung, 10/93 */
- } /* tung, 10/93 */
- if(phrase_found == false) /* tung, 10/93 */
- internal_weight = 0.0; /* tung, 10/93 */
- phrase_found = false; /* tung, 10/93 */
- }
- s_free(document_section); /* tung, 10/93 */
- }
- }
- #endif
-
- #ifndef NESTED_BOOLEANS /* 10,93 */
- #ifdef BOOLEANS
- if (gLastNot) {
- document_score_array[internal_document_id] = 0;
- /* printf("search_word: boolean 'not' scored\n"); */
- }
- else
- #endif
- #endif /* #ifndef NESTED_BOOLEANS */
- {
- /* if(doc_id > 0) we are doing a relevant document */
- /*
- printf("wgt: %ld, internal weight: %lf, idf: %lf occurances: %ld\n",
- wgt,internal_weight, idf,number_of_occurances);
- fflush(stdout);
- */
- #ifndef NEW_WEIGHT /* tung, 5/94 */
- internal_weight*=idf; /* ses - for inverse doc. freq. */
- #endif
- #ifndef NESTED_BOOLEANS
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- document_score_array[internal_document_id] +=
- (query_wgt * internal_weight);
- #else
- document_score_array[internal_document_id] +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- #endif
- #else
- /* tung, 10/93 */
- if(number_of_operands == 1) {
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- document_score_array[internal_document_id] +=
- (query_wgt * internal_weight);
- #else
- document_score_array[internal_document_id] +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- #endif
- }
- else {
- if((number_of_operands > 1) && (search_result_array != NULL)) {
- if(weight == LITERAL_FLAG) {
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- ((search_result_array[operand_id]).doc_ids_array[phrase_count]).score +=
- (query_wgt * internal_weight);
- #else
- ((search_result_array[operand_id]).doc_ids_array[phrase_count]).score +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- #endif
- if(((search_result_array[operand_id]).doc_ids_array[phrase_count]).score > 0) {
- ((search_result_array[operand_id]).doc_ids_array[phrase_count]).doc_id = internal_document_id;
- phrase_count++;
- search_result_array[operand_id].number_of_hits = phrase_count;
- }
- }
- #ifdef FIELDS /* tung, 1/94 */
- else if(weight == NUMERIC_FLAG || weight == PARTIAL_FLAG) {
- #else
- else if(weight == PARTIAL_FLAG) {
- #endif
- if(NumPart_score_array[internal_document_id] <= 0)
- ++numeric_partial_valid_entries;
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- NumPart_score_array[internal_document_id] =
- MAXIMUM(NumPart_score_array[internal_document_id], (query_wgt * internal_weight));
- #else
- NumPart_score_array[internal_document_id] +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- #endif
- }
- else {
- ((search_result_array[operand_id]).doc_ids_array[count]).doc_id = internal_document_id;
- #ifdef NEW_WEIGHT /* tung, 5/94 */
- ((search_result_array[operand_id]).doc_ids_array[count]).score +=
- (query_wgt * internal_weight);
- #else
- ((search_result_array[operand_id]).doc_ids_array[count]).score +=
- (doc_id) ? (internal_weight * RF_WEIGHTING) : internal_weight;
- #endif
- }
- }
- }
- #endif
- /* tung, 10/93 */
-
- }
- /*
- printf("Score array: %lf\n",document_score_array[internal_document_id]);
- fflush(stdout);
- */
-
- /* i+=INDEX_ELEMENT_SIZE; Purify (umr): uninitialized memory read: (up) */
- }
- }
-
- #ifdef PARTIALWORD
- #ifdef FIELDS /* tung, 1/94 */
- index_file_block_number =
- look_up_partialword_in_dictionary(field_name,
- NULL, &number_of_occurances, db);
- #else
- index_file_block_number =
- look_up_partialword_in_dictionary(NULL, &number_of_occurances, db);
- #endif
- }
- #endif
-
- #ifdef NESTED_BOOLEANS /* tung, 1/94 */
- if(number_of_operands > 1) {
- long index = 0;
- #ifdef FIELDS /* tung, 1/94 */
- if(weight == NUMERIC_FLAG || weight == PARTIAL_FLAG) {
- #else
- if(weight == PARTIAL_FLAG) {
- #endif
- if(!make_doc_ids_array(operand_id, db->doc_table_allocated_entries))
- return(-1);
- search_result_array[operand_id].number_of_hits =
- numeric_partial_valid_entries;
- /*for (count=0; count < db->doc_table_allocated_entries; count++) {*/
- for (count=doc_start; count <= doc_end ; count++) {
- if(NumPart_score_array[count] > 0) {
- ((search_result_array[operand_id]).doc_ids_array[index]).doc_id = count;
- ((search_result_array[operand_id]).doc_ids_array[index]).score
- = NumPart_score_array[count];
- NumPart_score_array[count] = 0.0;
- ++index;
- }
- if(index == numeric_partial_valid_entries)
- break;
- }
- }
- }
- #endif
-
- /* tung, 10/93 */
- #ifdef NESTED_BOOLEANS
- if((number_of_operands > 1) && (search_result_array != NULL)) {
- if(!save_operand_id(operand_id, search_result_array,db->doc_table_allocated_entries))
- return(-1);
- search_result_array[operand_id].operand_id = operand_id;
- ++operand_id;
- }
- #endif
- /* tung, 10/93 */
-
- #ifndef NESTED_BOOLEANS /* tung, 10/94 */
- #ifdef BOOLEANS
- for (count=0; count < db->doc_table_allocated_entries; count++) { /* 12/91 GS TLG */
- if (!gLastAnd) { /* 12/91 GS TLG */
- prev_score_array[count] = document_score_array[count]; /* 12/91 GS TLG */
- } /* 12/91 GS TLG */
- else { /* 12/91 GS TLG */
- if ((document_score_array[count] == prev_score_array[count]) /* 12/91 GS TLG */
- || (prev_score_array[count] == 0)) {
- document_score_array[count] = 0; /* 12/91 GS TLG */
- prev_score_array[count] = 0; /* 12/91 GS TLG */
- } /* 12/91 GS TLG */
- else {
- prev_score_array[count] = document_score_array[count]; /* 12/91 GS TLG */
- } /* 12/91 GS TLG */
- } /* 12/91 GS TLG */
- } /* 12/91 GS TLG */
- /* if (gLastAnd) printf("search_word: boolean `and' scored\n"); */
- #endif
- #endif
-
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif /* BOOLEANS */
- return(0);
- }
-
- else if(0 == index_file_block_number){
- /* an error occurred on looking up the word */
- #ifdef BOOLEANS
- gLastNot= gLastAnd= false;
- #endif
- return(-1);
- }
-
- else { /* index_file_block_number is negative */
- #ifdef NESTED_BOOLEANS /* tung, 10/93 */
- if((number_of_operands > 1) && (search_result_array != NULL)) {
- if(!save_operand_id(operand_id, search_result_array,db->doc_table_allocated_entries))
- return(-1);
- search_result_array[operand_id].operand_id = operand_id;
- search_result_array[operand_id].number_of_hits = 0;
- ++operand_id;
- }
- #else
- #ifdef BOOLEANS
- if (gLastAnd)
- for (count=0; count < db->doc_table_allocated_entries; count++) {
- document_score_array[count] = 0;
- prev_score_array[count] = 0;
- }
- gLastNot= gLastAnd= false;
- #endif
- #endif
- return(0); /* word not present */
- }
- }
-
-
- /* now collect the best hits */
- long finished_search_word(db)
- database *db;
- {
- #ifdef NESTED_BOOLEANS
- long number_of_hits; /* tung, 10/93 */
- #endif
-
- #ifdef BOOL
- if (currentQuery != NULL)
- return; /* do nothing for boolean */
- #endif /* def BOOL */
-
- /* tung, 10/93 */
- #ifdef NESTED_BOOLEANS
- if((number_of_operands > 1) && (search_result_array != NULL)) {
- number_of_hits = retriev_result(db->doc_table_allocated_entries,
- document_score_array);
- clear_search_result_array(&number_of_operands);
- }
- #endif
- /* tung, 10/93 */
-
- /* check the document_score_array */
- if(document_score_array_len < db->doc_table_allocated_entries)
- make_document_score_array(db->doc_table_allocated_entries);
-
- make_best_hits_array(max_hit_retrieved);
- sort_best_hits(db);
- syn_Free( db->syn_Table,&db->syn_Table_Size );
-
- return(0);
- }
-
-